
  
# ----------------------------------------------------------
# Rendering setup: graphics device, Korean-capable fonts,
# ggplot2 defaults and locale.
# ----------------------------------------------------------
library(knitr)
# Render chunk figures with the ragg PNG device.
knitr::opts_chunk$set(dev = "ragg_png")

library(showtext)
library(sysfonts)

# macOS default Korean font.
# Registration is best-effort: on machines without this file font_add()
# fails and the family simply stays unregistered.
if (!"AppleSDGothicNeo" %in% sysfonts::font_families()) {
  try(sysfonts::font_add(
    family  = "AppleSDGothicNeo",
    regular = "/System/Library/Fonts/AppleSDGothicNeo.ttc"
  ), silent = TRUE)
}

# Noto Sans KR, registered under the family name "notokr" (best-effort too).
if (!"notokr" %in% sysfonts::font_families()) {
  try(sysfonts::font_add_google("Noto Sans KR", "notokr"), silent = TRUE)
}
showtext_auto(TRUE)

# Choose the first family that was actually registered. Both registrations
# above may fail silently, so neither name can be assumed to exist; fall back
# to the generic "sans" family and say so instead of handing ggplot2 a family
# that was never registered.
base_kr <- if ("notokr" %in% sysfonts::font_families()) {
  "notokr"
} else if ("AppleSDGothicNeo" %in% sysfonts::font_families()) {
  "AppleSDGothicNeo"
} else {
  warning(
    "No Korean font could be registered; falling back to \"sans\". ",
    "Hangul text may not render correctly.",
    call. = FALSE
  )
  "sans"
}

library(ggplot2)
# Make the chosen family the default for themes and for text/label geoms,
# and expose it through an option so later code can look it up.
theme_set(theme_minimal(base_family = base_kr))
update_geom_defaults("text",  list(family = base_kr))
update_geom_defaults("label", list(family = base_kr))
options(cooc_font_family = base_kr)

# locale
options(encoding = "UTF-8")
try(Sys.setlocale("LC_CTYPE", "ko_KR.UTF-8"), silent = TRUE)


# Second setup chunk: repeats the knitr device setting made at the top of the
# file, then attaches the packages used by the analysis below.
library(knitr)
knitr::opts_chunk$set(dev = "ragg_png")

library(readxl)    # read_excel()
library(readr)     # read_csv()
library(dplyr)     # arrange(), mutate(), if_else(), desc()
library(tidyr)     # no call to it is visible in this section
library(purrr)     # map_dfr()
library(stringr)   # str_squish(), str_split(), str_starts()
library(tibble)    # tibble()
library(quanteda)  # corpus(), tokens(), dfm() and friends
library(keyATM)    # keyATM(), keyATM_read()
library(ggplot2)   # plotting

# Seed keywords per topic.
# A named list: each name is a topic label, each element the character vector
# of seed words for that topic. Words that are not features of the joint DFM
# are dropped later by filter_keywords().
keywords <- list(
  economy = c(
    "경제", "기본소득", "소득", "국채", "세금", "증세",
    "이자율", "가계부채", "구조조정", "시장", "재정", "통화"
  ),
  budget = c(
    "예산", "재원", "발행", "지원금",
    "재난지원금", "손실보상법", "불균형", "현금"
  ),
  gender = c(
    "여성", "여자", "페미니즘", "성평등", "성차별",
    "젠더", "하사", "중사", "다양성", "성범죄"
  ),
  security_foreign = c(
    "안보", "평화", "전쟁", "미사일", "우크라이나", "전술핵",
    "북한", "동맹", "침공", "나토", "젤렌스키", "러시아",
    "외교", "국방", "한미일", "방위산업", "군사적"
  ),
  welfare = c(
    "청년", "저출산", "아이", "일자리", "지원",
    "복지", "교육", "안전망", "기본소득", "장애인"
  ),
  daejangdong = c(
    "대장동", "김만배", "녹취록", "수사",
    "화천대유", "도시개발공사", "공소장", "게이트"
  ),
  electoral_reform = c(
    "위성정당", "연동형", "선거법", "대표",
    "개헌", "선거제도", "개혁", "합당"
  ),
  party_politics = c("정당", "민주당", "국민의힘", "후보", "단일화"),
  policy_general = c(
    "정책", "정책에", "정책으로", "정책을", "정책이", "정책의",
    "공약", "공약을", "공약으로"
  )
)

# Stopwords and preprocessing
# Korean tokens to discard before building a DFM. The vector is passed to
# preprocess_text_vec(), which hands it to tokens_remove() as its pattern.
# Some entries appear more than once (e.g. "이런", "그때", "같아요");
# presumably harmless, since the vector is only used as a removal set.
my_stopwords <- c(
  "이런","우리","저는","제가","지금","그런","하는","있는","때문에","그리고","그것","이제","합니다","그래서","즉","하지만","여러분",
  "말씀을","정말","이렇게","아까","하고","대해서","않습니까","것은","그런데","말씀","이미","그렇게","있습니다","많이","사실","너무",
  "하겠습니다","매우","번째","있는데","해서","한번","생각합니다","됩니다","우리가","겁니다","다른","저희가","말씀하신","것이","이게",
  "것처럼","하면","한다","드립니다","없는","정말로","저도","아니고","생각이","내가","통해서","먼저","가지고","된다","가장","위한",
  "어떻게","같습니다","되는","또는","듭니다","좋은","예를","그걸","가지","알겠습니다","드리고","해야","없습니다","계속","일단","제대로",
  "있다","그건","건지","여쭤보겠습니다","아니","사실은","맞습니다","전적으로","많은","드리고요","하지","같아요","결국","답을","식으로",
  "아마","구체적으로","이번","선택을","가지는","싶습니다","그러나","한다는","그때","저한테","그게","자체를","그때","같은","당연히",
  "있게","가능한","지금도","이해가","싶은","그러니까","특히","거죠","지금은","아닌","아니라","된다는","때는","있다고","만들고","그렇죠",
  "아주","거의","이유는","혹시","다시","바로","하게","드리는","중에","피해를","되는데","똑같은","확실하게","말씀드리겠습니다",
  "그거는","같아요","있으면","같은","그게","정도","이런","이제","다시","조금","이거","저희","일단","왜냐하면","그런데","그래서",
  "그러니까","이렇습니다","그렇습니다","있습니다","없습니다","합니다","합시다","했는데","됐습니다","있어요","없어요","어떤",
  "하셨는데","되면","후보께서","것을","얘기를","글쎄","보면","여러","그러면","거기에","아니겠습니까","대한","이거를","알고","있고",
  "것입니다","결국은","된다고","것이고","한다고","대해서는","보니까","이거는","거는","것도","나온","되고","하나","번째로","않고",
  "하는데","것이고","말씀드립니다","전에","있다는"
)

# Turn a character vector of documents into a document-feature matrix.
#
#   x            : text, one document per element
#   my_stopwords : patterns handed to tokens_remove()
#
# Punctuation and symbols are dropped at tokenisation, stopwords are removed,
# and only tokens matching ^[가-힣]{2,}$ (two or more Hangul syllables, nothing
# else) are kept. Returns the dfm built from the surviving tokens.
preprocess_text_vec <- function(x, my_stopwords){
  tokenised <- tokens(corpus(x), remove_punct = TRUE, remove_symbols = TRUE)
  without_stopwords <- tokens_remove(tokenised, pattern = my_stopwords)
  hangul_only <- tokens_select(
    without_stopwords,
    pattern   = "^[가-힣]{2,}$",
    valuetype = "regex",
    padding   = FALSE
  )
  dfm(hangul_only)
}

# ----------------------------------------------------------
# Load data
# NOTE: The paths below are placeholders. Replace each "PATH/TO/..." with the
# location where you saved the corresponding data file.
# ----------------------------------------------------------
# News crawl: the sheet is read without a header row, so readxl names the
# columns by position.
news_raw <- read_excel(
  path      = "PATH/TO/news_crawling.xlsx",
  col_names = FALSE
)
# Candidate remarks, one CSV per candidate.
lee_data  <- read_csv(file = "PATH/TO/Text_data_Lee.csv")
yoon_data <- read_csv(file = "PATH/TO/Text_data_Yoon.csv")


# Build text vectors
# News: take the fifth column of the header-less sheet, drop missing cells and
# squish whitespace.
text_data <- news_raw[["...5"]]
text_data <- text_data[!is.na(text_data)]
text_data <- str_squish(text_data)

# Split each cell on commas and re-join the pieces with single spaces.
news_vec <- map_chr(
  str_split(text_data, ","),
  function(parts) paste(parts, collapse = " ")
)

# Candidate remarks, lower-cased.
lee_vec  <- lee_data$remarks |> tolower()
yoon_vec <- yoon_data$remarks |> tolower()

# Common DFM space
# Each source goes through the same three steps: tokenise into a dfm, drop
# documents left with no tokens, then number the remaining documents.

# News articles
dfm_news <- news_vec |> preprocess_text_vec(my_stopwords)
dfm_news <- dfm_subset(dfm_news, ntoken(dfm_news) > 0)
docnames(dfm_news) <- paste0("News_", seq_len(ndoc(dfm_news)))

# Lee remarks
dfm_lee <- lee_vec |> preprocess_text_vec(my_stopwords)
dfm_lee <- dfm_subset(dfm_lee, ntoken(dfm_lee) > 0)
docnames(dfm_lee) <- paste0("Lee_", seq_len(ndoc(dfm_lee)))

# Yoon remarks
dfm_yoon <- yoon_vec |> preprocess_text_vec(my_stopwords)
dfm_yoon <- dfm_subset(dfm_yoon, ntoken(dfm_yoon) > 0)
docnames(dfm_yoon) <- paste0("Yoon_", seq_len(ndoc(dfm_yoon)))

# Both candidates stacked and relabelled with one shared "Cand_" prefix.
dfm_cand <- rbind(dfm_lee, dfm_yoon)
docnames(dfm_cand) <- paste0("Cand_", seq_len(ndoc(dfm_cand)))

# Joint matrix: news rows first, candidate rows after.
dfm_all <- rbind(dfm_news, dfm_cand)

# Filter keywords to features
# Keep, for every topic in `kw`, only the words that are features of `dfm`.
#
#   dfm : object whose feature names are read with featnames()
#   kw  : list of character vectors (names, if any, are preserved)
#
# Returns a list with the same length and names as `kw`; a topic with no
# surviving word becomes character(0).
filter_keywords <- function(dfm, kw){
  vocab <- featnames(dfm)
  keep_known <- function(ws) {
    ws[match(ws, vocab, nomatch = 0L) > 0L]
  }
  lapply(kw, keep_known)
}
# Restrict every topic's seed words to the joint vocabulary, then drop topics
# left without any seed word. lengths() always returns an integer vector,
# unlike sapply(), whose return type depends on its input.
keywords_all <- filter_keywords(dfm_all, keywords)
keywords_all <- keywords_all[lengths(keywords_all) > 0L]

# One KeyATM over joint space
# A single base keyATM is fitted on news and candidate documents together, with
# keyword topics only (no_keyword_topics = 0), so both groups are described by
# the same topics.
set.seed(123)
mod_all <- keyATM(
  docs = keyATM_read(dfm_all),
  no_keyword_topics = 0,
  keywords = keywords_all,
  model = "base",
  options = list(seed = 123, iterations = 100)
)

theta <- mod_all$theta

# The relabelling and the row masks below only make sense when theta has one
# column per keyword topic and one row per document of dfm_all. Fail with a
# clear message here rather than with a subscript or dimnames error further
# down. NOTE(review): rows are additionally assumed to be in dfm_all's document
# order; confirm against the keyATM documentation.
stopifnot(
  "theta must have one column per keyword topic" =
    ncol(theta) == length(keywords_all),
  "theta must have one row per document in dfm_all" =
    nrow(theta) == ndoc(dfm_all)
)
colnames(theta) <- names(keywords_all)

# Split the document-topic matrix by source using the docname prefixes.
is_news <- str_starts(docnames(dfm_all), "News_")
is_cand <- str_starts(docnames(dfm_all), "Cand_")

theta_news <- theta[is_news, , drop = FALSE]
theta_cand <- theta[is_cand, , drop = FALSE]

# Bayesian bootstrap
# Summarise the bootstrap distribution of mean(x_treat) - mean(x_ctrl).
#
# In each of the B replicates both groups are re-weighted independently:
# Gamma(shape = 1) draws are normalised to sum to one within each group and
# used as observation weights for a weighted mean.
#
#   x_treat, x_ctrl : non-empty numeric vectors without missing values
#   B               : number of replicates (default 5000L)
#
# Returns a one-row tibble with
#   post_mean, post_median : mean / median of the simulated differences
#   l95, u95               : 2.5% and 97.5% quantiles
#   p_gt0                  : share of simulated differences above zero
#
# Stops with an error on empty, non-numeric or NA-containing input, or an
# invalid B. An empty group would otherwise silently contribute a weighted
# mean of 0.
bayes_boot_diff <- function(x_treat, x_ctrl, B = 5000L){
  if (!is.numeric(x_treat) || !is.numeric(x_ctrl)) {
    stop("`x_treat` and `x_ctrl` must be numeric.", call. = FALSE)
  }
  if (length(x_treat) == 0L || length(x_ctrl) == 0L) {
    stop("`x_treat` and `x_ctrl` must each contain at least one value.",
         call. = FALSE)
  }
  if (anyNA(x_treat) || anyNA(x_ctrl)) {
    stop("`x_treat` and `x_ctrl` must not contain missing values.",
         call. = FALSE)
  }
  if (length(B) != 1L || !is.numeric(B) || !is.finite(B) || B < 1) {
    stop("`B` must be a single number of at least 1.", call. = FALSE)
  }

  n_t <- length(x_treat)
  n_c <- length(x_ctrl)

  # Treatment weights are drawn before control weights in every replicate, so
  # the random-number stream matches a replicate()-based loop under a seed.
  sims <- vapply(seq_len(B), function(i) {
    w_t <- rgamma(n_t, 1)
    w_t <- w_t / sum(w_t)
    w_c <- rgamma(n_c, 1)
    w_c <- w_c / sum(w_c)
    sum(w_t * x_treat) - sum(w_c * x_ctrl)
  }, numeric(1))

  # names = FALSE keeps the "2.5%" / "97.5%" labels out of the result columns.
  ci <- quantile(sims, probs = c(0.025, 0.975), names = FALSE)

  tibble(
    post_mean   = mean(sims),
    post_median = median(sims),
    l95 = ci[1],
    u95 = ci[2],
    p_gt0 = mean(sims > 0)
  )
}

# Per-topic contrast between candidate and news topic proportions.
# For every topic column of theta, run the Bayesian bootstrap (candidates minus
# news), stack the one-row summaries, sort by absolute posterior mean
# (largest first) and flag rows whose 95% interval bounds are both non-zero
# with the same sign.
set.seed(123)
res <- colnames(theta) |>
  map(function(tp){
    bayes_boot_diff(theta_cand[, tp], theta_news[, tp], B = 5000L) |>
      mutate(Topic = tp, .before = 1)
  }) |>
  bind_rows() |>
  arrange(desc(abs(post_mean))) |>
  mutate(sig95 = if_else(l95 * u95 > 0, "Credible (95%)", "Not credible"))

# Figure 2
# Dot-and-interval plot of the per-topic differences in `res`: the point is the
# posterior mean, the bar spans l95..u95, topics are ordered by posterior mean
# and the dashed line marks zero.
# The theme is given the font family stored in the "cooc_font_family" option,
# because a bare theme_minimal() would reset base_family to ggplot2's default
# and drop the family configured during setup.
ggplot(res, aes(x = reorder(Topic, post_mean), y = post_mean)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point() +
  geom_errorbar(aes(ymin = l95, ymax = u95), width = 0.15) +
  coord_flip() +
  labs(title = "Treatment effect on Topic proportions (Candidates − News)",
       x = "Topic", y = "Mean difference (95% CrI)") +
  theme_minimal(base_family = getOption("cooc_font_family", ""))

